{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "335a5de6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys  \n",
    "import json \n",
    "import copy\n",
    "import torch\n",
    "import argparse\n",
    "import numpy as np\n",
    "from PIL import Image \n",
    "from tqdm import tqdm \n",
    "from utils import model_gen, load_jsonl\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer  \n",
    "  \n",
    "# Checkpoint identifier handed to from_pretrained for both the tokenizer and the model.\n",
    "ckpt_path = 'internlm/internlm-xcomposer2-vl-7b'\n",
    "\n",
    "# trust_remote_code=True allows transformers to execute the custom model/tokenizer\n",
    "# code that ships with the checkpoint.\n",
    "tokenizer = AutoTokenizer.from_pretrained(ckpt_path, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    ckpt_path,\n",
    "    device_map=\"cuda\",\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "# Switch to eval mode, move to the GPU and cast the weights to float16.\n",
    "model = model.eval().cuda().half()\n",
    "\n",
    "# NOTE(review): presumably model_gen (or the checkpoint's own code) reads\n",
    "# model.tokenizer -- confirm against utils.model_gen.\n",
    "model.tokenizer = tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "733859a2",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# System prompt inserted into the `system` turn of every request built below.\n",
    "meta_instruction = \"\"\"You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n",
    "- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n",
    "- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen by the user such as English and 中文.\n",
    "- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively based on the provided image.\"\"\"\n",
    "\n",
    "# Chat template with two slots: the system prompt and the sample's question.\n",
    "# Defined once here because it does not change between iterations.\n",
    "prompt_template = '[UNUSED_TOKEN_146]system\\n{}[UNUSED_TOKEN_145]\\n[UNUSED_TOKEN_146]user\\n{}Answer this question in detail.[UNUSED_TOKEN_145]\\n[UNUSED_TOKEN_146]assistant\\n'\n",
    "\n",
    "# Load the samples with a context manager so the file handle is closed as soon as\n",
    "# parsing finishes, and decode explicitly as UTF-8 instead of the locale default.\n",
    "with open('data/mm-vet/mm-vet.json', 'r', encoding='utf-8') as samples_file:\n",
    "    samples = json.load(samples_file)\n",
    "\n",
    "# NOTE(review): `count` is never read anywhere in this notebook; candidate for removal.\n",
    "count = 0\n",
    "\n",
    "# Maps each sample key to whatever model_gen returns for that sample.\n",
    "results = {}\n",
    "for k, v in tqdm(samples.items()):\n",
    "    # Each sample provides the image file name and the question text.\n",
    "    im_path = 'data/mm-vet/images/' + v['imagename']\n",
    "    text = prompt_template.format(meta_instruction, v['question'])\n",
    "    # Run generation under CUDA automatic mixed precision.\n",
    "    with torch.cuda.amp.autocast():\n",
    "        out = model_gen(model, text, im_path)\n",
    "\n",
    "    results[k] = out\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46a8a54b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the collected results as pretty-printed JSON.\n",
    "# NOTE(review): the doubled '.json.json' extension looks accidental; the path is left\n",
    "# unchanged so anything that already reads this file keeps working -- confirm and rename.\n",
    "output_path = 'Output/MMVet_InternLM_XComposer_VL.json.json'\n",
    "\n",
    "# Create the output directory if needed so the write cannot fail after a long\n",
    "# inference run just because the folder is missing.\n",
    "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "\n",
    "# The context manager flushes and closes the file even if serialization raises.\n",
    "with open(output_path, 'w') as output_file:\n",
    "    json.dump(results, output_file, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d4b366c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
